# Take small RNA-seq data and assess crRNA processing.
# Usage: python crRNAfigureMaker.py path_to_fasta keyword_to_select_file(s)_for_analysis
# Expects crRNAfigureMaker_params.txt and spacerEnds.dict in the working directory.

# Load the biological and plotting parameters. The parameter file holds one
# value per line, in this order:
#   1. the CRISPR repeat sequence
#   2. the reference tRNA sequence used to recognise reference reads
#   3. the height of the plot's y axis (parsed as a float)
with open('crRNAfigureMaker_params.txt', mode='rU') as params:
    cr_repeat, ileTRNA, raw_yaxis = (params.readline().strip() for _ in range(3))
yaxisHeight = float(raw_yaxis)

import operator
from itertools import imap
import sys, os
# Command-line arguments: the directory to search for FASTA files, followed by
# a keyword that a file name must contain for the file to be analysed.
path, mut = sys.argv[1], sys.argv[2]
# Spacer dictionary used to verify reads; it is read and closed further down.
dic = open('spacerEnds.dict', mode='rU')

def reverseComplement(sequence):
    '''Return the reverse complement of a DNA sequence, in upper case.

    The input may be in any case. Characters other than A, C, G and T are
    kept (upper-cased) at their mirrored position.
    '''
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    flipped = sequence.upper()[::-1]
    return ''.join(pairing.get(base, base) for base in flipped)
def sdist(S1,S2):
    '''Count the positions at which two strings differ.

    Only the positions present in both strings are compared: when the
    lengths differ, the tail of the longer string is ignored.
    '''
    mismatches = 0
    for left, right in zip(S1, S2):
        if left != right:
            mismatches += 1
    return mismatches

# Index every k-mer of the reference tRNA so that reads matching the reference
# can be told apart from the rest. Each distinct k-mer is a key with value 0.
kmerSize = 25  # number of consecutive reference bases a read must contain
window = 1     # step between successive k-mer start positions
kmer_starts = range(0, len(ileTRNA) - kmerSize + 1, window)
tRNA_kmers = dict.fromkeys(
    (ileTRNA[start:start + kmerSize] for start in kmer_starts), 0)
        
# The first five bases of the CRISPR repeat are the marker used to locate the
# 5' end of the repeat inside each read.
rep_start = cr_repeat[:5]

# The spacer dictionary file holds a dictionary literal keyed by the last few
# bases of every spacer of the CRISPR system (5 bases, per the original
# author's note), so that short crRNA reads can be verified against it.
# NOTE(review): eval() executes whatever expression the file contains. Only
# run this on a dictionary file you trust, or consider ast.literal_eval.
with dic:  # close the handle even if parsing the file fails
    spacer_tails = eval(dic.read())
if not spacer_tails:
    raise ValueError('spacerEnds.dict does not contain any spacer tails')
# Length of a spacer tail, taken from an arbitrary key; all keys are presumed
# to share this length.
L = len(next(iter(spacer_tails)))

# Walk the input directory and sort every read of the selected FASTA files
# into one of two lists (reads matching neither are discarded):
#   list_of_processed - (trimmed read, length) for reads whose sequence from
#                       the repeat start onwards matches the CRISPR repeat
#   list_of_trna      - reads containing at least one reference tRNA k-mer
list_of_trna = []
list_of_processed = []
for dirpath, _dirnames, filenames in os.walk(path):
    for filename in filenames:
        # Only FASTA files whose name contains the requested keyword
        if '.fasta' not in filename or mut not in filename:
            continue
        # Build the location from the directory os.walk is currently visiting,
        # so files in subdirectories (and a path given without a trailing
        # separator) are opened correctly.
        with open(os.path.join(dirpath, filename), mode='rU') as fastaFile:
            print('Adding tRNA or crRNA containing reads from '+filename+' to lists')
            for line in fastaFile:
                if '>' in line:
                    continue  # FASTA header line
                read = line.strip()
                rep_index = read.find(rep_start)
                # Only reads with at least one base before the repeat start
                # are considered; a repeat start at index 0 is not.
                if rep_index > 0:
                    # Trim everything before the start of the CRISPR repeat
                    processed_repeat = read[rep_index:]

                    # Verify that spacer is present if read is short. A read
                    # with fewer than L bases before the repeat cannot be
                    # verified. A short read that fails verification is
                    # dropped and not tested against the reference either.
                    if len(processed_repeat) < 12:
                        if rep_index < L or read[rep_index-L:rep_index] not in spacer_tails:
                            continue

                    # Does processed_repeat agree with the CRISPR repeat at
                    # every position they share?
                    # NOTE(review): sdist ignores the tail of the longer
                    # string, so reads running past the end of the repeat
                    # also pass this check - confirm that is intended.
                    if sdist(cr_repeat, processed_repeat) < 1:
                        list_of_processed.append((processed_repeat, len(processed_repeat)))
                        continue

                # Now filter through your dataset to isolate reference matching reads
                if any(kmer in read for kmer in tRNA_kmers):
                    list_of_trna.append(read)
        
# The number of reference reads is the normalisation denominator for the
# histogram; it is kept as a float and printed that way.
n_reference = len(list_of_trna)
n_processed = len(list_of_processed)
denominator = float(n_reference)
print('There are ' + str(denominator) + ' reference sequences in your dataset')
print('There are ' + str(n_processed) + ' crRNA sequences in your dataset')

# Gather the length of every processed crRNA. These lengths are the data for
# the histogram of 3' ends.
lengths = [length for (_crrna, length) in list_of_processed]

# PLOTTING DATA
# Bar chart of crRNA lengths along the CRISPR repeat. Each bar is the number
# of processed reads of that length divided by the number of reference reads.
import numpy as np
from pylab import *
import matplotlib.pyplot as plt

# Without reference reads there is nothing to normalise against.
if denominator == 0:
    raise SystemExit('No reference reads were found, so crRNA counts cannot be normalized')

repeat_len = len(cr_repeat)
index = np.arange(repeat_len+1)
# Tick k is labelled with the k-th base of the repeat; tick 0 stays blank.
x_labels = [' ']+list(cr_repeat)

# One bin per possible length 0..repeat_len, each centred on its integer.
# Integer edges would not work: np.histogram closes the last bin on the
# right, so full-length repeats would be merged into the bar before them.
# Lengths beyond the repeat fall outside the bins and are not plotted.
bins = np.arange(repeat_len+2) - 0.5

h,b = np.histogram(lengths,bins)
NormH = [float(count)/denominator for count in h]
width = 0.9*(b[1]-b[0])
center = (b[:-1] + b[1:])/2.0  # explicit float division: centres are 0, 1, ..., repeat_len

plt.figure()  # pass figsize=(20, 20) here for a larger canvas
plt.bar(center,NormH,align='center',width=width)
plt.axis([0,repeat_len+1, 0, yaxisHeight]) ## CHANGE THIS AS NEEDED FOR FIGS
# For larger text, pass size=20 to the label and tick calls below.
plt.xlabel('CRISPR repeat')
plt.ylabel('Normalized Count')
plt.xticks(index, x_labels)
plt.show()
